Script for exploratory analysis of microbial genomes. The data directory should contain the necessary CSV files. Please adjust the pathways below to match your system.
import warnings
from Bio import SeqIO
from Bio.SeqUtils import GC
import gzip
import glob
import os
import os.path
import subprocess as spc
from pathlib import Path
from collections import defaultdict
import csv
import numpy as np
import scipy as sp
import pprint
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import PolynomialFeatures
from patsy import dmatrix
import seaborn as sns
import pandas as pd
import pandas_profiling
from fitter import Fitter
from sklearn.cluster import KMeans
import matplotlib
matplotlib.style.use('seaborn-white')
import matplotlib.pyplot as plt
sns.set_style("ticks")
sns.set_context("notebook", font_scale=1.5)
%matplotlib inline
%config InlineBackend.figure_format = 'retina' ## 2x plot resolution
## --------------- Procedure selection -------------------------
## These of course need to be set to True at least once
download_data = False
create_directories = False
initialize_csv_files = False
create_csv_files = False
## This will download another "assembly_summary.txt" file that will replace the one for bacteria.
## Download separately or save to a different directory
download_phages = False
create_phage_directory = False
initialize_phage_csv_file = False
create_phage_csv_file = False
'''
Script to download all complete microbial genomes from NCBI genbank. All commands are unix/linux shell commands
and can be copied and pasted directly into command prompt if someone wishes to do so. Please be aware, that ncbi
will temporarily block the connection if they deside that there is overuse of the server. They recommend no more
that three requests per second and use of the weekends or holidays (US) for such a purpose.
Thanks to 5heikki at Biostars.org for code recommendations
'''
if download_data:
'''Set the environment. Change these to your liking'''
my_path = ('/home/atol/Documents/GC_Content/Data')
assembly_file = Path(str(my_path))
store_path = Path('/home/atol/Documents/GC_Content/Data/Bacteria')
'''
Get the list of assemblies if not already present (it might be a good idea
to delete "assembly_summary.txt" after finishing with fetching the data,
so a fresh list can be downloded at a later time):
'''
if not os.path.isfile(my_path + 'assembly_summary.txt'):
my_command = ("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt")
spc.call(my_command, cwd=my_path, shell=True)
'''Parse the addresses of complete microbial genomes:'''
parse_command = ('''awk -F '\t' '{if($12=="Complete Genome") print $20}' assembly_summary.txt > assembly_summary_complete_genomes.txt''')
spc.call(parse_command, cwd=my_path, shell=True)
'''Make a dir for storing the data'''
if not os.path.isdir(str(store_path)):
spc.call('mkdir Bacteria', cwd=my_path, shell=True)
'''
Fetch the data.
The download will send three requests per second (--wait=seconds, as suggested
by ncbi to avoid abusing the server) and will resume from the last file the
next time, if for some reason the connection will be lost.
TODO: progress indication
'''
spc.call('''for next in $(cat assembly_summary_complete_genomes.txt);
do wget --wait=0.3 --continue -P Bacteria "$next"/*genomic.fna.gz
done''', cwd=my_path, shell=True)
'''
The above wget command will also fetch files named:
..cds_from_genomic.fna.gz
..rna_from_genomic.fna.gz.
We don't want these, so they must be removed after the downloading finishes:
'''
spc.call('''rm -v Bacteria/*cds_from_genomic.fna.gz
Bacteria/*cds_from_genomic.fna.gz''', cwd=my_path, shell=True)
# -------------------------------------------------------------------------------------
# Download phages
if download_phages:
my_path = ('/home/atol/Documents/GC_Content/Data')
assembly_file = Path(str(my_path))
store_path = Path('/home/atol/Documents/GC_Content/Data/Phages')
if not os.path.isfile(my_path + 'assembly_summary_complete_phages.txt'):
phage_command = ("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/viral/assembly_summary.txt")
spc.call(phage_command, cwd=my_path, shell=True)
'''Parse the addresses of complete phage genomes:'''
parse_command = ('''awk -F '\t' '{if(($0 ~ /phage/) && ($0 ~ /Complete/)) print $20}' assembly_summary.txt > assembly_summary_complete_phages.txt''')
spc.call(parse_command, cwd=my_path, shell=True)
'''Make a dir for storing the data'''
if not os.path.isdir(str(store_path)):
spc.call('mkdir Phages', cwd=my_path, shell=True)
'''
Fetch the data.
The download will send three requests per second (--wait=seconds, as suggested
by ncbi to avoid abusing the server) and will resume from the last file the
next time, if for some reason the connection will be lost.
TODO: progress indication
'''
spc.call('''for next in $(cat assembly_summary_complete_phages.txt);
do wget --wait=0.3 --continue -P Phages "$next"/*genomic.fna.gz
done''', cwd=my_path, shell=True)
'''
The above wget command will also fetch files named:
..cds_from_genomic.fna.gz
..rna_from_genomic.fna.gz.
We don't want these, so they must be removed after the downloading finishes:
'''
spc.call('''rm -v Phages/*cds_from_genomic.fna.gz
Phages/*cds_from_genomic.fna.gz''', cwd=my_path, shell=True)
if create_directories:
## Data directory. Change this according to your system
my_directory = ('/home/atol/Documents/GC_Content/Data')
## Place where sequences are stored
data_dir = (my_directory + '/Bacteria/')
phage_data_dir = (my_directory + '/Phages/')
## Directory paths to store generated csv and image files
image_path = (my_directory + '/Images')
scv_path = (my_directory + '/CSV')
## Check if directories exist and if not, create them
try:
os.makedirs(image_path)
os.makedirs(scv_path)
except OSError:
if not os.path.isdir(image_path) or not os.path.isdir(scv_path):
raise
##------------ Initialize the csv files -----------------------
if initialize_csv_files:
## The whole dataset
with open('CSV/MyGC.csv', 'wb') as f_ini:
headers = ['ID',
'Description',
'Length',
'GC']
writer = csv.writer(f_ini, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(headers)])
## Complete genomes
with open('CSV/MyGC_genome.csv', 'wb') as f_ini:
headers = ['ID',
'Description',
'Length',
'GC']
writer = csv.writer(f_ini, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(headers)])
## Plasmids
with open('CSV/MyGC_plasmid.csv', 'wb') as f_ini:
headers = ['ID',
'Description',
'Length',
'GC']
writer = csv.writer(f_ini, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(headers)])
## There some phages in the bacteria dataset
## Phage
with open('CSV/MyGC_phage.csv', 'wb') as f_ini:
headers = ['ID',
'Description',
'Length',
'GC']
writer = csv.writer(f_ini, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(headers)])
## ------------------ Phages dataset ----------------------------
if initialize_phage_csv_file:
## Complete phage genomes
with open('CSV/MyGC_phage2.csv', 'wb') as f_ini:
headers = ['ID',
'Description',
'Length',
'GC']
writer = csv.writer(f_ini, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(headers)])
if create_csv_files:
## --------------------- Extract the files -------------------------
for gzip_path in glob.glob("%s/*.fna.gz" % data_dir):
if not os.path.isdir(gzip_path):
with gzip.open(gzip_path, 'rb') as in_file:
s = in_file.read()
# Now store the uncompressed data
path_to_store = gzip_path[:-3] # remove the '.gz' from the filename
# store uncompressed file data from 's' variable
with open(path_to_store, 'w') as g:
g.write(' '.join(s.split(','))) ## remove any commas from Description
## ----------- Write the entries into separate csv's ------------------------
"""
The code below can be more elegant and short. It is my opinion though,
it is more readable the way it is, and that's what counts, as long as
it does not affect performance.
"""
## ----------- Select the complete entries ---------------------------
records = list(SeqIO.parse(path_to_store, "fasta"))
with open('CSV/MyGC.csv', 'a+') as f:
for record in records:
record.description = ' '.join(record.description.split()[1:])
rec = [record.id,
record.description,
str(len(record)),
str(GC(record.seq))]
## Filter for incomplete entries and write
if "complete" in record.description:
writer = csv.writer(f, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(rec)])
## ----------- Select the gemomes -------------------------------------------
with open('CSV/MyGC_genome.csv', 'a+') as f:
for record in records:
rec = [record.id,
record.description,
str(len(record)),
str(GC(record.seq))]
## Filter for incomplete entries and write
if "genome" in record.description and len(record) > plasmid_length_threshold: ## Check for plasmids with wrong annotation
if "complete" in record.description:
if not "plasmid" in record.description:
if not "phage" in record.description:
if not "putative" in record.description:
writer = csv.writer(f, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(rec)])
## ------------- Select plasmids from the annotation ------------------------
with open('CSV/MyGC_plasmid.csv', 'a+') as f:
for record in records:
rec = [record.id,
record.description,
str(len(record)),
str(GC(record.seq))]
## Filter for incomplete entries and write
if "plasmid" in record.description:
if "complete" in record.description:
if not "putative" in record.description:
writer = csv.writer(f, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(rec)])
## Account for misannotated entries
elif "genome" in record.description and (len(record) < plasmid_length_threshold):
if "complete" in record.description:
if not "putative" in record.description:
writer = csv.writer(f, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(rec)])
## ------------ Select phages ------------------------------------------------
with open('CSV/MyGC_phage.csv', 'a+') as f:
for record in records:
rec = [record.id,
record.description,
str(len(record)),
str(GC(record.seq))]
## Filter for incomplete entries and write
if "phage" in record.description:
if "complete" in record.description:
if not "putative" in record.description:
writer = csv.writer(f, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(rec)])
if create_phage_csv_file:
## --------------------- Extract the files -------------------------
for gzip_path in glob.glob("%s/*.fna.gz" % phage_data_dir):
if not os.path.isdir(gzip_path):
with gzip.open(gzip_path, 'rb') as in_file:
s = in_file.read()
# Now store the uncompressed data
path_to_store = gzip_path[:-3] # remove the '.gz' from the filename
# store uncompressed file data from 's' variable
with open(path_to_store, 'w') as g:
g.write(' '.join(s.split(','))) ## remove any commas from Description
## ----------- Select the complete entries ---------------------------
records = list(SeqIO.parse(path_to_store, "fasta"))
with open('CSV/MyGC_phage2.csv', 'a+') as f:
for record in records:
record.description = ' '.join(record.description.split()[1:])
rec = [record.id,
record.description,
str(len(record)),
str(GC(record.seq))]
## Filter for incomplete entries and write
if "complete" in record.description:
if not "protein" in record.description:
if not "segment" in record.description:
writer = csv.writer(f, delimiter=' ',
quoting=csv.QUOTE_NONE,
escapechar=" ",
doublequote=False)
writer.writerow(['\t'.join(rec)])
Data directory. Change these accordingly
my_directory = ('/home/atol/Documents/GC_Content/Data')
image_path = (my_directory + '/Images')
csv_path = (my_directory + '/CSV')
The directory "CSV" (at this point) should contain five csv files:
MyGC.csv (Every sequence except the bulk of phages)
MyGC_genomes.csv (Genomic data only)
MyGC_plasmid.csv (Plasmids)
MyGC_phage.csv (Phage data included in 'genomes')
MyGC_phage2.csv (Phage data from /viral directory of NCBI)
Later, we will merge the two phage csv files into one that will be used for the analysis.
From these files, we create separate dataframes with Pandas.
MyGC_sep = pd.read_csv(csv_path + '/MyGC.csv', sep='\t', index_col='ID')
MyGC_genome = pd.read_csv(csv_path + '/MyGC_genome.csv', sep='\t', index_col='ID')
MyGC_plasmid = pd.read_csv(csv_path + '/MyGC_plasmid.csv', sep='\t', index_col='ID')
MyGC_phage1 = pd.read_csv(csv_path + '/MyGC_phage.csv', sep='\t', index_col='ID')
MyGC_phage2 = pd.read_csv(csv_path + '/MyGC_phage2.csv', sep='\t', index_col='ID')
## Merge the two phage datasets and remove duplicates
## It is impossible to use the index to drop duplicates, so we will add index as column,
## remove duples and then remove that column as well
merged = MyGC_phage2.append(MyGC_phage1)
merged['ID'] = merged.index
MyGC_phages = merged.drop_duplicates(subset='ID', keep='first')
del MyGC_phages['ID']
MyGC_phages.to_csv(csv_path + '/MyGC_phages.csv', sep='\t') # Save into a csv for reference
## Create lists so it is easier to iterate through
merged2_phages = pd.read_csv(csv_path + '/MyGC_phages.csv', sep='\t', index_col='ID').values.tolist()
MyGC_genome2 = pd.read_csv(csv_path + '/MyGC_genome.csv', sep='\t', index_col='ID').values.tolist()
print 'Entries in the whole dataset: ', len(MyGC_sep)
print 'Entries in "genome" set: ', len(MyGC_genome)
print 'Entries in "plasmid" set: ', len(MyGC_plasmid)
print 'Entries in "phage" set: ', len(MyGC_phage1)
print 'Entries in "phage2" set: ', len(MyGC_phage2)
print 'Entries in "phages" set: ', len(MyGC_phages)
Plasmid (and phages) number vary accross organisms. Some have none, while others contain a varying number of them. Therefore, the dataframes cannot be used as they are. Instead, we have to create lists, containing each plasmid found, paired with its host:
Pairs_length = []
Pairs_GC = []
Phage_Pairs_length = []
Phage_Pairs_GC = []
## The names will be needed later
Pairs_name = []
Phage_Pairs_name = []
for i in MyGC_plasmid.itertuples():
pname = ' '.join(i[1].split()[:-2])
plasmid_name = ' '.join(pname.split()[:6])
for entry in MyGC_genome.itertuples():
## clean the names from "complete genome" etc
name = ' '.join(entry[1].split()[:-2])
host_name = ' '.join(name.split()[:5])
if host_name in plasmid_name:
if not 'putative' in plasmid_name:
Pairs_length.append([entry[2], i[2]])
Pairs_GC.append([entry[3], i[3]])
Pairs_name.append(plasmid_name)
for x in merged2_phages:
pname = ' '.join(x[0].split()[:-2])
phage_name = ' '.join(pname.split()[:5])
phage_genus = ' '.join(x[0].split()[:1])
for i in MyGC_genome2:
name = ' '.join(i[0].split()[:-2])
host_name = ' '.join(name.split()[:5])
if phage_genus in host_name:
if not 'UNVERIFIED' in phage_genus:
Phage_Pairs_length.append([i[1], x[1]])
Phage_Pairs_GC.append([i[2], x[2]])
Phage_Pairs_name.append(phage_name)
...and append these lists to new dataframes for convenience:
dfm_GC = pd.DataFrame.from_records(Pairs_GC, columns=['Host', 'Plasmid'])
dfm_phage_GC = pd.DataFrame.from_records(Phage_Pairs_GC, columns=['Host', 'Phage'])
dfm_length = pd.DataFrame.from_records(Pairs_length, columns=['Host', 'Plasmid'])
dfm_phage_length = pd.DataFrame.from_records(Phage_Pairs_length, columns=['Host', 'Phage'])
pr = stats.pearsonr(MyGC_sep.Length, MyGC_sep.GC)
fig, ax = plt.subplots()
fig.set_size_inches(12, 9)
sns.regplot(MyGC_sep.Length, MyGC_sep.GC, scatter_kws={'s':10}, ax=ax)
ax.annotate('pearsonr = %.2f;' % pr[0], xy=(1.12e7, 85), fontsize=14)
ax.annotate('p = %.3f' % pr[1], xy=(1.4e7, 85), fontsize=16)
ax.set_xlabel('Length (bp)')
ax.set_ylabel('G+C (%)')
fig.suptitle('All Data G+C content vs Length', fontsize=20, color="k")
matplotlib.pyplot.savefig('Images2/all.png', bbox_inches='tight')
matplotlib.pyplot.savefig('Images2/all.pdf', format='pdf', dpi=300, bbox_inches='tight')
sns.despine()
x_data = MyGC_genome.Length
y_data = MyGC_genome.GC
xp_data = MyGC_plasmid.Length
yp_data = MyGC_plasmid.GC
xh_data = MyGC_phages.Length
yh_data = MyGC_phages.GC
fig2 = plt.figure(figsize=(12, 9))
plt.plot(x_data, y_data, '.', label='Genomes')
plt.plot(xp_data, yp_data, '*', color='g', alpha=0.5, label='Plasmids')
plt.plot(xh_data, yh_data, 'o', color='r', alpha=0.5, label='Phages')
plt.ylabel('G+C')
plt.xlabel('Length (bp)')
plt.legend(loc=4)
fig2.suptitle('Separate Data G+C content vs Length', fontsize=20, color="k")
matplotlib.pyplot.savefig('Images/all_sep.png', bbox_inches='tight')
matplotlib.pyplot.savefig('Images/all_sep.pdf', format='pdf', dpi=300, bbox_inches='tight')
plt.show()
pr = stats.pearsonr(MyGC_genome.Length, MyGC_genome.GC)
fig, ax = plt.subplots()
fig.set_size_inches(12, 9)
axes = plt.gca()
sns.regplot(MyGC_genome.Length, MyGC_genome.GC, scatter_kws={'s':10}, ax=ax)
ax.annotate('pearsonr = %.2f;' % pr[0], xy=(0.63e7, 85), fontsize=16)
ax.annotate('p = %.3f' % pr[1], xy=(0.95e7, 85), fontsize=14)
axes.set_ylim([0,100])
ax.set_xlabel('Length (bp)')
ax.set_ylabel('G+C (%)')
fig.suptitle('Genomes G+C content vs Length', fontsize=20, color="k")
matplotlib.pyplot.savefig('Images/genomes.png', bbox_inches='tight')
matplotlib.pyplot.savefig('Images/genomes.pdf', format='pdf', dpi=300, bbox_inches='tight')
sns.despine()
ax = sns.jointplot('Length', 'GC',
data=MyGC_genome,
kind="kde",
scatter_kws={"s": 10},
size=10,
stat_func=None)
# Seaborn complains if we try to put a '+' sign in the axes titles. We must set this separately
ax.set_axis_labels(xlabel='Length', ylabel='G+C') #
pr_plas = stats.pearsonr(MyGC_plasmid.Length, MyGC_plasmid.GC)
fig2, ax = plt.subplots()
fig2.set_size_inches(12, 9)
axes = plt.gca()
sns.regplot(MyGC_plasmid.Length, MyGC_plasmid.GC, scatter_kws={'s':10}, ax=ax)
ax.annotate('pearsonr = %.2f;' % pr_plas[0], xy=(0.11e7, 85), fontsize=16)
ax.annotate('p = %.3f' % pr_plas[1], xy=(0.163e7, 85), fontsize=14)
ax.set_xlabel('Length (bp)')
ax.set_ylabel('G+C (%)')
axes.set_ylim([0,100])
fig2.suptitle('PLASMIDS G+C content vs Length', fontsize=20, color="k")
sns.despine()
matplotlib.pyplot.savefig('Images/plasmids.png', bbox_inches='tight')
matplotlib.pyplot.savefig('Images/plasmids.pdf', format='pdf', dpi=300, bbox_inches='tight')
ax = sns.jointplot('Length', 'GC',
data=MyGC_plasmid,
kind="kde",
scatter_kws={"s": 10},
size=10,
stat_func=None)
# Seaborn complains if we try to put a '+' sign in the axes titles. We must set this separately
ax.set_axis_labels(xlabel='Length', ylabel='G+C') #
pr_phag = stats.pearsonr(MyGC_phages.Length, MyGC_phages.GC)
fig3, ax = plt.subplots()
fig3.set_size_inches(12, 9)
axes = plt.gca()
sns.regplot(MyGC_phages.Length, MyGC_phages.GC, scatter_kws={'s':22}, ax=ax, robust=True)
ax.annotate('pearsonr = %.2f; p = %.3f' %(pr_phag[0], pr_phag[1]), xy=(0.330e6, 80), fontsize=14)
ax.annotate('$r^2$ = %.2f; p = %.4f' %(pr_phag[0]**2, pr_phag[1]), xy=(0.330e6, 75), fontsize=14)
ax.set_xlabel('Length (bp)')
ax.set_ylabel('G+C (%)')
axes.set_ylim([0,90])
fig3.suptitle('PHAGES G+C content vs Length', fontsize=20, color="k")
sns.despine()
matplotlib.pyplot.savefig('Images/phages.png', bbox_inches='tight')
matplotlib.pyplot.savefig('Images/phages.pdf', format='pdf', dpi=300, bbox_inches='tight')
ax = sns.jointplot('Length', 'GC',
data=MyGC_phages,
kind="kde",
scatter_kws={"s": 10},
size=10,
stat_func=None)
# Seaborn complains if we try to put a '+' sign in the axes titles. We must set this separately
ax.set_axis_labels(xlabel='Length', ylabel='G+C') #
def myplot(x, y, xlabel, ylabel, title):
plt.plot(x, y, '.')
plt.ylabel(ylabel)
plt.xlabel(xlabel)
plt.title(title)
plt.figure(figsize=(24, 40))
# Genomes log-log
plt.subplot(731)
myplot(np.log(MyGC_genome.Length), np.log(MyGC_genome.GC), 'log(Length)', 'log(G+C)', 'Genomes log-log')
# Plasmids log-log
plt.subplot(732)
myplot(np.log(MyGC_plasmid.Length), np.log(MyGC_plasmid.GC), 'log(Length)', '', 'Plasmids log-log')
# Phages log-log
plt.subplot(733)
myplot(np.log(MyGC_phages.Length), np.log(MyGC_phages.GC), 'log(Length)', '', 'Phages log-log')
#------------------------------
# Genomes half-log
plt.subplot(734)
myplot(MyGC_genome.Length, np.log(MyGC_genome.GC), 'Length', 'log(G+C)', 'Genomes half-log')
# Plasmids half-log
plt.subplot(735)
myplot(MyGC_plasmid.Length, np.log(MyGC_plasmid.GC), 'Length', '', 'Plasmids half-log')
# Phages half-log
plt.subplot(736)
myplot(MyGC_phages.Length, np.log(MyGC_phages.GC), 'Length', '', 'Phages half-log')
#----
# Genomes half-log
plt.subplot(737)
myplot(np.log(MyGC_genome.Length), MyGC_genome.GC, 'log(Length)', 'G+C', '')
# Plasmids half-log
plt.subplot(738)
myplot(np.log(MyGC_plasmid.Length), MyGC_plasmid.GC, 'log(Length)', '', '')
# Phages half-log
plt.subplot(739)
myplot(np.log(MyGC_phages.Length), MyGC_phages.GC, 'log(Length)', '', '')
#---------------------------------
# Genomes square root
plt.subplot(7,3,10)
myplot(np.sqrt(MyGC_genome.Length), np.sqrt(MyGC_genome.GC), 'sqrt(Length)', 'sqrt(G+C)', 'Genomes square root')
# Plasmids square root
plt.subplot(7,3,11)
myplot(np.sqrt(MyGC_plasmid.Length), np.sqrt(MyGC_plasmid.GC), 'sqrt(Length)', '', 'Plasmids square root')
# Phages square root
plt.subplot(7,3,12)
myplot(np.sqrt(MyGC_phages.Length), np.sqrt(MyGC_phages.GC), 'sqrt(Length)', '', 'Phages square root')
#---------------------------------
# Genomes square
plt.subplot(7,3,13)
myplot(np.square(MyGC_genome.Length), np.square(MyGC_genome.GC), 'square(Length)', 'square(G+C)', 'Genomes square')
# Plasmids square
plt.subplot(7,3,14)
myplot(np.square(MyGC_plasmid.Length), np.square(MyGC_plasmid.GC), 'square(Length)', '', 'Plasmids square')
# Phages square
plt.subplot(7,3,15)
myplot(np.square(MyGC_phages.Length), np.square(MyGC_phages.GC), 'square(Length)', '', 'Phages square')
#---------------------------------
# Genomes half-square
plt.subplot(7,3,16)
myplot(MyGC_genome.Length, np.square(MyGC_genome.GC), 'Length', 'square(G+C)', 'Genomes half-square')
# Plasmids half-square
plt.subplot(7,3,17)
myplot(MyGC_plasmid.Length, np.square(MyGC_plasmid.GC), 'Length', '', 'Plasmids half-square')
# Phages half-square
plt.subplot(7,3,18)
myplot(MyGC_phages.Length, np.square(MyGC_phages.GC), 'Length', '', 'Phages half-square')
#-----
# Genomes half-square
plt.subplot(7,3,19)
myplot(np.square(MyGC_genome.Length), MyGC_genome.GC, 'square(Length)', 'G+C', '')
# Plasmids half-square
plt.subplot(7,3,20)
myplot(np.square(MyGC_plasmid.Length), MyGC_plasmid.GC, 'square(Length)', '', '')
# Phages half-square
plt.subplot(7,3,21)
myplot(np.square(MyGC_phages.Length), MyGC_phages.GC, 'square(Length)', '', '')
plt.subplots_adjust(top=1.5, bottom=0.15, left=0.10, right=0.90, hspace=0.3, wspace=0.15)
plt.show()
Get the low-high numbers for each data set:
from IPython.display import display, Markdown, Latex
## ---------- Extreme ends of sorted dataframes (GC) ----------------
## All data
all_ext_high_GC_temp = MyGC_sep.sort_values('GC', ascending=0).head(5)
all_ext_low_GC_temp = MyGC_sep.sort_values('GC').head(5)
all_ext_high_L_temp = MyGC_sep.sort_values('Length', ascending=0).head(5)
all_ext_low_L_temp = MyGC_sep.sort_values('Length').head(5)
## Genomes
gen_ext_high_GC_temp = MyGC_genome.sort_values('GC', ascending=0).head(5)
gen_ext_low_GC_temp = MyGC_genome.sort_values('GC').head(5)
gen_ext_high_L_temp = MyGC_genome.sort_values('Length', ascending=0).head(5)
gen_ext_low_L_temp = MyGC_genome.sort_values('Length').head(5)
## Plasmids
plasm_ext_high_GC_temp = MyGC_plasmid.sort_values('GC', ascending=0).head(5)
plasm_ext_low_GC_temp = MyGC_plasmid.sort_values('GC').head(5)
plasm_ext_high_L_temp = MyGC_plasmid.sort_values('Length', ascending=0).head(5)
plasm_ext_low_L_temp = MyGC_plasmid.sort_values('Length').head(5)
## Phages
phag_ext_high_GC_temp = MyGC_phages.sort_values('GC', ascending=0).head(5)
phag_ext_low_GC_temp = MyGC_phages.sort_values('GC').head(5)
phag_ext_high_L_temp = MyGC_phages.sort_values('Length', ascending=0).head(5)
phag_ext_low_L_temp = MyGC_phages.sort_values('Length').head(5)
## ---- Lists --------
## All data
all_ext_high_GC_lst = []
all_ext_low_GC_lst = []
all_ext_high_L_lst = []
all_ext_low_L_lst = []
## Genomes
gen_ext_high_GC_lst = []
gen_ext_low_GC_lst = []
gen_ext_high_L_lst = []
gen_ext_low_L_lst = []
## Plasmids
plasm_ext_high_GC_lst = []
plasm_ext_low_GC_lst = []
plasm_ext_high_L_lst = []
plasm_ext_low_L_lst = []
## Phages
phag_ext_high_GC_lst = []
phag_ext_low_GC_lst = []
phag_ext_high_L_lst = []
phag_ext_low_L_lst = []
## ------------ Extract the data -----------------------
## All data
for item in all_ext_high_GC_temp.itertuples():
all_ext_high_GC_lst.append([str(item[1]), item[2], item[3]])
for item in all_ext_low_GC_temp.itertuples():
all_ext_low_GC_lst.append([str(item[1]), item[2], item[3]])
for item in all_ext_high_L_temp.itertuples():
all_ext_high_L_lst.append([str(item[1]), item[2], item[3]])
for item in all_ext_low_L_temp.itertuples():
all_ext_low_L_lst.append([str(item[1]), item[2], item[3]])
## Genomes
for item in gen_ext_high_GC_temp.itertuples():
gen_ext_high_GC_lst.append([str(item[1]), item[2], item[3]])
for item in gen_ext_low_GC_temp.itertuples():
gen_ext_low_GC_lst.append([str(item[1]), item[2], item[3]])
for item in gen_ext_high_L_temp.itertuples():
gen_ext_high_L_lst.append([str(item[1]), item[2], item[3]])
for item in gen_ext_low_L_temp.itertuples():
gen_ext_low_L_lst.append([str(item[1]), item[2], item[3]])
## Plasmids
for item in plasm_ext_high_GC_temp.itertuples():
plasm_ext_high_GC_lst.append([str(item[1]), item[2], item[3]])
for item in plasm_ext_low_GC_temp.itertuples():
plasm_ext_low_GC_lst.append([str(item[1]), item[2], item[3]])
for item in plasm_ext_high_L_temp.itertuples():
plasm_ext_high_L_lst.append([str(item[1]), item[2], item[3]])
for item in plasm_ext_low_L_temp.itertuples():
plasm_ext_low_L_lst.append([str(item[1]), item[2], item[3]])
## Phages
for item in phag_ext_high_GC_temp.itertuples():
phag_ext_high_GC_lst.append([str(item[1]), item[2], item[3]])
for item in phag_ext_low_GC_temp.itertuples():
phag_ext_low_GC_lst.append([str(item[1]), item[2], item[3]])
for item in phag_ext_high_L_temp.itertuples():
phag_ext_high_L_lst.append([str(item[1]), item[2], item[3]])
for item in phag_ext_low_L_temp.itertuples():
phag_ext_low_L_lst.append([str(item[1]), item[2], item[3]])
## ----------- Back to dataframes -------------------------
## All data
all_ext_high_GC_df = pd.DataFrame.from_records(all_ext_high_GC_lst, columns=['Description', 'Length', 'GC'])
all_ext_low_GC_df = pd.DataFrame.from_records(all_ext_low_GC_lst, columns=['Description', 'Length', 'GC'])
all_ext_high_L_df = pd.DataFrame.from_records(all_ext_high_L_lst, columns=['Description', 'Length', 'GC'])
all_ext_low_L_df = pd.DataFrame.from_records(all_ext_low_L_lst, columns=['Description', 'Length', 'GC'])
## Genomes
gen_ext_high_GC_df = pd.DataFrame.from_records(gen_ext_high_GC_lst, columns=['Description', 'Length', 'GC'])
gen_ext_low_GC_df = pd.DataFrame.from_records(gen_ext_low_GC_lst, columns=['Description', 'Length', 'GC'])
gen_ext_high_L_df = pd.DataFrame.from_records(gen_ext_high_L_lst, columns=['Description', 'Length', 'GC'])
gen_ext_low_L_df = pd.DataFrame.from_records(gen_ext_low_L_lst, columns=['Description', 'Length', 'GC'])
## Plasmids
plasm_ext_high_GC_df = pd.DataFrame.from_records(plasm_ext_high_GC_lst, columns=['Description', 'Length', 'GC'])
plasm_ext_low_GC_df = pd.DataFrame.from_records(plasm_ext_low_GC_lst, columns=['Description', 'Length', 'GC'])
plasm_ext_high_L_df = pd.DataFrame.from_records(plasm_ext_high_L_lst, columns=['Description', 'Length', 'GC'])
plasm_ext_low_L_df = pd.DataFrame.from_records(plasm_ext_low_L_lst, columns=['Description', 'Length', 'GC'])
## Phages
phag_ext_high_GC_df = pd.DataFrame.from_records(phag_ext_high_GC_lst, columns=['Description', 'Length', 'GC'])
phag_ext_low_GC_df = pd.DataFrame.from_records(phag_ext_low_GC_lst, columns=['Description', 'Length', 'GC'])
phag_ext_high_L_df = pd.DataFrame.from_records(phag_ext_high_L_lst, columns=['Description', 'Length', 'GC'])
phag_ext_low_L_df = pd.DataFrame.from_records(phag_ext_low_L_lst, columns=['Description', 'Length', 'GC'])
Add latex output to pandas Dataframe class:
pd.set_option('display.notebook_repr_html', True)
def _repr_latex_(self):
return "\centering{%s}" % self.to_latex()
pd.DataFrame._repr_latex_ = _repr_latex_ # monkey patch pandas DataFrame
Allow for full lenght of text (100 characters here)
pd.options.display.max_colwidth = 100
display(all_ext_high_GC_df, all_ext_low_GC_df)
display(all_ext_high_L_df, all_ext_low_L_df)
display(gen_ext_high_GC_df, gen_ext_low_GC_df)
display(gen_ext_high_L_df, gen_ext_low_L_df)
display(plasm_ext_high_GC_df, plasm_ext_low_GC_df)
display(plasm_ext_high_L_df, plasm_ext_low_L_df)
display(phag_ext_high_GC_df, phag_ext_low_GC_df)
display(phag_ext_high_L_df, phag_ext_low_L_df)
Get the means of the different data sets:
print "Means for all data: \n", MyGC_sep.mean()
print "\n"
print "Means for genomes: \n", MyGC_genome.mean()
print "\n"
print "Means for plasmids: \n", MyGC_plasmid.mean()
print "\n"
print "Means for phages: \n", MyGC_phages.mean()
In distributions with heavy skew, the median gives a better estimate of a typical value:
print "Median for all data: \n", MyGC_sep.median()
print "\n"
print "Median for genomes: \n", MyGC_genome.median()
print "\n"
print "Median for plasmids: \n", MyGC_plasmid.median()
print "\n"
print "Median for phages: \n", MyGC_phages.median()
## Genomes
genomes_sorted_gc = []
genomes_sorted_length = []
genomes_sorted_gc_temp = MyGC_genome.sort_values('GC')
genomes_sorted_length_temp = MyGC_genome.sort_values('Length')
for item in genomes_sorted_gc_temp.itertuples():
genomes_sorted_gc.append([item[3]])
for item in genomes_sorted_length_temp.itertuples():
genomes_sorted_length.append([item[2]])
## Plasmids
plasmids_sorted_gc = []
plasmids_sorted_length = []
plasmids_sorted_gc_temp = MyGC_plasmid.sort_values('GC')
plasmids_sorted_length_temp = MyGC_plasmid.sort_values('Length')
for item in plasmids_sorted_gc_temp.itertuples():
plasmids_sorted_gc.append([item[3]])
for item in plasmids_sorted_length_temp.itertuples():
plasmids_sorted_length.append([item[2]])
## Phages
phages_sorted_gc = []
phages_sorted_length = []
phages_sorted_gc_temp = MyGC_phages.sort_values('GC')
phages_sorted_length_temp = MyGC_phages.sort_values('Length')
for item in phages_sorted_gc_temp.itertuples():
phages_sorted_gc.append([item[3]])
for item in phages_sorted_length_temp.itertuples():
phages_sorted_length.append([item[2]])
plt.figure(figsize=(24, 14))
# Genomes
# Length
plt.subplot(231)
plt.plot(genomes_sorted_length)
plt.title('Genomes')
plt.ylabel('Length (bp)')
plt.grid(True)
# G+C
plt.subplot(234)
plt.plot(genomes_sorted_gc)
plt.ylabel('G+C')
plt.grid(True)
# ----------------------
# Plasmids
# Length
plt.subplot(232)
plt.plot(plasmids_sorted_length)
plt.title('Plasmids')
plt.grid(True)
# G+C
plt.subplot(235)
plt.plot(plasmids_sorted_gc)
plt.grid(True)
#-----------------------
# Phages
# Length
plt.subplot(233)
plt.plot(phages_sorted_length)
plt.title('Phages')
plt.grid(True)
# G+C
plt.subplot(236)
plt.plot(phages_sorted_gc)
plt.grid(True)
plt.subplots_adjust(top=1, bottom=0.08, left=0.10, right=0.90, hspace=0.12, wspace=0.30)
plt.show()
A large proportion of the microbial genomes in the dataset belongs to organisms that are being cultivated for many decades, such as Escerichia coli. This can affect the density of the plots. Lets see how many entries are there for each microbial species. We will select the entries from the dataset based only on the species official name, discarting information about specific strains, and we will count the entries based on frequency:
species = []
## Get the name from the description
for entry in MyGC_genome.itertuples():
name = ' '.join(entry[1].split()[:-2])
host_name = ' '.join(name.split()[:2])
species.append(host_name)
## Create a dictionary with a species name and its frequency
c = dict()
for i in set(species):
c[i] = species.count(i)
## Convert to a pandas dataframe
s = pd.DataFrame.from_dict(c.items(), orient="columns", dtype=None)
s.columns = ['name', 'frequency']
## Sort according to frequency
dd = s.sort_values('frequency', ascending=0)
Plot the frequencies. Note that the cell must run twice to display properly
fig = plt.subplot()
#xlabels = ['name %i' % i for i in range(dd)]
#sns.countplot(x=s.values, data=s, palette="Greens_d");
sns.set_context({"figure.figsize": (24, 10)})
g = sns.barplot(x='name', y='frequency', data=dd.head(50))
#g.set_xticks('name')
fig.set_xticklabels(dd['name'], rotation=90)
fig.set_xlabel('species name', fontsize=20, color="k")
fig.set_ylabel('frequency (counts)', fontsize=20, color="k")
fig.set_title('Top 50 sequenced microbial species', fontsize=30, color="k")
GC content variance for the whole dataset
np.var(MyGC_sep.GC)
We will create a list with unique names, as before, and for each occurrence of a name in the dataframe will take the GC content value, store them all in a separate temporary list and calculate the variance. Then store the groups name with the variance in a new list:
species2 = [] ## Get unigue species names
for entry in MyGC_genome.itertuples():
name = ' '.join(entry[1].split()[:-2])
host_name = ' '.join(name.split()[:2]) ## Select only the first word of the entry
if not 'Bacterium' in host_name: ## I noticed entries with these names creeping up, so I exlude them here
if not 'Endosymbiont' in host_name:
if not 'endosymbiont' in host_name:
species2.append(host_name)
names_var = []
for s in species2: ## For each unique name in the names list
count = [] ## list to hold the GC values
sp = str(s)
for entry in MyGC_genome.itertuples(): ## Get the name
name = ' '.join(entry[1].split()[:-2])
host_name = ' '.join(name.split()[:2])
if sp in host_name:
count.append([entry.GC]) ## Store GC value in "count"
n_var = np.var(count) ## Calculate the variance of the entries in "count"
names_var.append([s, n_var]) ## Store the variance value with the unique name
s2 = pd.DataFrame.from_records(names_var)
s2.columns = ['name', 'variance']
dd2 = s2.sort_values('variance', ascending=0)
dd2[dd2.variance>0].head(10) ## change this number to see less or more
dd2.head(10)
dd3 = s2.sort_values('variance', ascending=True)
dd3[dd2.variance>0.1].head(30)
## HC0=Heteroskedasticity-corrected OLS
## HC3= Heteroskedasticity-corrected and good for small samples --throws memory error
MyGC_genome_con = sm.add_constant(MyGC_genome.Length) # Adds a constant so we can fit the intercept
gen_host_corrected = sm.OLS(MyGC_genome.GC, MyGC_genome_con).fit(cov_type='HC0', use_t=True)
print(gen_host_corrected.summary())
X = dfm_GC.Host
y = dfm_GC.Plasmid
X = sm.add_constant(X) # Adds a constant so we can fit the intercept
Pl_host_result_ols = sm.OLS(y, X).fit(cov_type='HC0', use_t=True)
print("Model parameters: \n%s\n" % Pl_host_result_ols.params)
print(Pl_host_result_ols.summary())
pr = stats.pearsonr(dfm_GC.Host, dfm_GC.Plasmid)
slope, intercept, r_value, p_value, slope_std_error = stats.linregress(dfm_GC.Host, dfm_GC.Plasmid)
fig, ax = plt.subplots()
fig.set_size_inches(10, 10)
axes = plt.gca()
sns.regplot(dfm_GC.Host, dfm_GC.Plasmid, scatter_kws={'s':10}, ax=ax, robust=True)
ax.annotate('pearsonr = %.2f; p = %.4f' %(pr[0], pr[1]), xy=(50, 25), fontsize=14)
ax.annotate('$r^2$ = %.2f' %Pl_host_result_ols.rsquared, xy=(50, 23), fontsize=14)
ax.annotate('$y=%3.5sx%3.5s$'%(slope, intercept), xy=(50, 27), fontsize=14)
ax.set_xlabel('Host G+C (%)')
ax.set_ylabel('Plasmid G+C (%)')
fig.suptitle('Plasmid G+C content vs Host G+C', fontsize=20, color="k")
matplotlib.pyplot.savefig('Images2/plasmid_host.png', bbox_inches='tight')
matplotlib.pyplot.savefig('Images2/plasmid_host.pdf', format='pdf', dpi=300, bbox_inches='tight')
sns.despine()
X2 = dfm_phage_GC.Host
y2 = dfm_phage_GC.Phage
X2 = sm.add_constant(X2) # Adds a constant so we can fit the intercept
phages_corrected = sm.OLS(y2, X2).fit(cov_type='HC0', use_t=True)
print phages_corrected.summary()
pr_phage = stats.pearsonr(dfm_phage_GC.Host, dfm_phage_GC.Phage)
slope, intercept, r_value, p_value, slope_std_error = stats.linregress(dfm_phage_GC.Host, dfm_phage_GC.Phage)
fig, ax = plt.subplots()
fig.set_size_inches(10, 10)
axes = plt.gca()
sns.regplot(dfm_phage_GC.Host, dfm_phage_GC.Phage, scatter_kws={'s':10}, ax=ax, robust=True)
ax.annotate('pearsonr = %.2f; p = %.4f' %(pr_phage[0], pr_phage[1]), xy=(30, 68), fontsize=14)
ax.annotate('$r^2$ = %.2f' %pr_phage[0]**2, xy=(30, 66), fontsize=14)
ax.annotate('$y=%3.5sx+%3.5s$'%(slope, intercept), xy=(30, 70), fontsize=14)
ax.set_xlabel('Host G+C (%)')
ax.set_ylabel('Phage G+C (%)')
fig.suptitle('Phage G+C content vs Host G+C', fontsize=20, color="k")
matplotlib.pyplot.savefig('Images/phage_host.png', bbox_inches='tight')
matplotlib.pyplot.savefig('Images/phage_host.pdf', format='pdf', dpi=300, bbox_inches='tight')
sns.despine()
from bokeh.io import push_notebook, output_notebook, show
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, BoxZoomTool, PanTool
# Display bokeh graphics in this notebook
output_notebook()
desc_int = []
## Get the name from the description
for entry in MyGC_genome.itertuples():
name = ' '.join(entry[1].split()[:-2])
host_name = ' '.join(name.split()[:2])
length = entry.Length
gc = entry.GC
desc_int.append([host_name, length, gc])
source = ColumnDataSource(dict(
x = [i[1] for i in desc_int],
y = [i[2] for i in desc_int],
name = [i[0] for i in desc_int]))
# make the hover tool to show custom information
hover = HoverTool(tooltips = "@name")
# create a new plot with tools
p = figure(plot_width=900, plot_height=700, tools=[hover, 'reset', 'zoom_in', 'zoom_out', 'pan', 'box_zoom'])
# add a circle renderer to create the scatter plot
p.circle('x', 'y', size=4, source=source)
p.title.text = "Bacterial genomic length versus G+C content"
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"
show(p, browser="opera") # Show the results. Change the browser here accordingly
plasm_int = []
## Get the name from the description
for entry in MyGC_plasmid.itertuples():
name = ' '.join(entry[1].split()[:-2])
host_name = ' '.join(name.split()[:2])
length = entry.Length
gc = entry.GC
plasm_int.append([host_name, length, gc])
source = ColumnDataSource(dict(
x = [i[1] for i in plasm_int],
y = [i[2] for i in plasm_int],
name = [i[0] for i in plasm_int]))
# make the hover tool to show custom information
hover = HoverTool(tooltips = "@name")
# create a new plot with tools
p = figure(plot_width=900, plot_height=700, tools=[hover, 'reset', 'zoom_in', 'zoom_out', 'pan', 'box_zoom'])
# add a circle renderer to create the scatter plot
p.circle('x', 'y', size=4, source=source)
p.title.text = "Plasmid sequence length versus G+C content"
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"
show(p, browser="opera") # Show the results. Change the browser here accordingly
We will use Pairs_length, Pairs_GC and Pairs_name from above
There is a memory error from the cell bellow due to a low io data rate limit. To avoid it, start jupyter notebook with:
jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000
source = ColumnDataSource(dict(
host = [i[0] for i in Pairs_GC],
plasmid = [i[1] for i in Pairs_GC],
name = [i for i in Pairs_name]))
# make the hover tool to show custom information
hover = HoverTool(tooltips = "@name")
# create a new plot with tools
p = figure(plot_width=900, plot_height=700, tools=[hover, 'reset', 'zoom_in', 'zoom_out', 'pan', 'box_zoom'])
# add a circle renderer to create the scatter plot
p.circle('host', 'plasmid', size=4, source=source)
p.title.text = "Plasmid G+C versus host G+C content"
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"
show(p, browser="opera") # Show the results. Change the browser here accordingly
phage_int = []
## Get the name from the description
for entry in MyGC_phages.itertuples():
name = ' '.join(entry[1].split()[:-2])
phage_name = ' '.join(name.split()[:5])
length = entry.Length
gc = entry.GC
phage_int.append([phage_name, length, gc])
source = ColumnDataSource(dict(
x = [i[1] for i in phage_int],
y = [i[2] for i in phage_int],
name = [i[0] for i in phage_int]))
# make the hover tool to show custom information
hover = HoverTool(tooltips = "@name")
# create a new plot with tools
p = figure(plot_width=900, plot_height=700, tools=[hover, 'reset', 'zoom_in', 'zoom_out', 'pan', 'box_zoom'])
# add a circle renderer to create the scatter plot
p.circle('x', 'y', size=4, source=source)
p.title.text = "Bacteriophage genomic length versus G+C content"
p.title.align = "center"
p.title.text_color = "black"
p.title.text_font_size = "25px"
show(p, browser="opera") # Show the results. Change the browser here accordingly